library(tidyverse)
library(lubridate)
library(splitstackshape)
library(scales)
library(grid)

timings <- read_csv("patient_timeline.csv")

tip.renaming <- read_csv("new_tip_names.csv")

constraints <- read_csv("sample_dates.csv") %>%
  mutate(samplingDate = parse_date(samplingDate, format = "%m/%d/%Y")) %>%
  mutate(date.constraint = map_dbl(samplingDate, function(x) decimal_date(x))) %>% 
  rename("old.tip" = "sequenceName")

constraints$new.tip <- tip.renaming$new.label[match(constraints$old.tip, tip.renaming$tip.name)]
constraints$new.tip[is.na(constraints$new.tip)] <- constraints$old.tip[is.na(constraints$new.tip)]

timings$split.count <- sapply(timings$study_no, function(x){
  if(!(x %in% tip.renaming$host)){
    return(1)
  } else {
    return(length(unique(tip.renaming$new.host[which(tip.renaming$host==x)])))
  }
})

timings.expanded <- expandRows(timings, "split.count")

timings.expanded$split.host <-timings.expanded$study_no

for(host in unique(timings.expanded$study_no)){
  rows <- which(timings.expanded$study_no==host)
  if(length(rows)>1){
    timings.expanded$split.host[rows] <- paste0(timings.expanded$split.host[rows], letters[1:length(rows)])
  }
}

timings <- timings.expanded

first.appearance <- sapply(unique(timings$split.host), function(x){
  rows <- which(startsWith(constraints$new.tip, x))
  min(constraints$date.constraint[rows] )
})

splits <- read_csv("splits_on_consensus_tree.csv")

splits$tip.dates <- constraints$date.constraint[match(splits$tip, constraints$new.tip)] 

# parse failure here are OK

splits$tip.dates[which(splits$host=="T241")] <- 2008.391

splits.2 <- splits[!is.na(splits$tip.dates),]

split.times <- splits.2 %>%
  group_by(subgraph) %>%
  summarise(first.samp = min(tip.dates), last.sample = max(tip.dates))
  
split.times$host <- sapply(split.times$subgraph, function(x) unlist(strsplit(x, "-"))[1])

hospital.stays.1 <- timings[,c("split.host", "study_no", "ward", "hosp_adm_d_1", "hosp_disc_d_1")]
hospital.stays.1$hosp_adm_d_1 <- parse_date_time(hospital.stays.1$hosp_adm_d_1, "%d%m%Y")
hospital.stays.1$hosp_adm_d_1 <- decimal_date(hospital.stays.1$hosp_adm_d_1) 
hospital.stays.1$hosp_disc_d_1 <- parse_date_time(hospital.stays.1$hosp_disc_d_1, "%d%m%Y")
hospital.stays.1$hosp_disc_d_1 <- decimal_date(hospital.stays.1$hosp_disc_d_1)

hospital.stays.2 <- timings[!is.na(timings$hosp_adm_d_2) ,c("split.host", "study_no", "ward", "hosp_adm_d_2", "hosp_disc_d_2")]
hospital.stays.2$hosp_adm_d_2 <- parse_date_time(hospital.stays.2$hosp_adm_d_2, "%d%m%Y")
hospital.stays.2$hosp_adm_d_2 <- decimal_date(hospital.stays.2$hosp_adm_d_2) 
hospital.stays.2$hosp_disc_d_2 <- parse_date_time(hospital.stays.2$hosp_disc_d_2, "%d%m%Y")
hospital.stays.2$hosp_disc_d_2 <- decimal_date(hospital.stays.2$hosp_disc_d_2)

colnames(hospital.stays.1) <- c("split.host", "host", "ward", "hosp_adm_d", "hosp_disc_d")
colnames(hospital.stays.2) <- c("split.host", "host", "ward", "hosp_adm_d", "hosp_disc_d")

hospital.stays <- rbind(hospital.stays.1, hospital.stays.2)

nurses <- c("T040","T056","T059","T241","T353")

nurses.starts <- decimal_date(parse_date_time("02/12/2007", "d/m/Y"))
nurses.ends <- decimal_date(parse_date_time("30/06/2008", "d/m/Y"))


nurses.df <- data.frame(split.host = nurses, host = nurses, ward="hcw", hosp_adm_d=nurses.starts, hosp_disc_d=nurses.ends)

hospital.stays <- rbind(hospital.stays, nurses.df)

hcw.nos <- hospital.stays$split.host[which(hospital.stays$ward=="hcw")]
paed.nos <- hospital.stays$split.host[which(hospital.stays$ward=="PICU 2")]
adult.nos <- hospital.stays$split.host[which(hospital.stays$ward=="surg")]
desired.order.split <- c(unique(hcw.nos), unique(paed.nos), unique(adult.nos))

hcw.nos <- hospital.stays$host[which(hospital.stays$ward=="hcw")]
paed.nos <- hospital.stays$host[which(hospital.stays$ward=="PICU 2")]
adult.nos <- hospital.stays$host[which(hospital.stays$ward=="surg")]
desired.order.notsplit <- c(unique(hcw.nos), unique(paed.nos), unique(adult.nos))

hospital.stays$colo <- sapply(hospital.stays$split.host, function(x) gsub("T", "C", x))
hospital.stays$split.host <- factor(hospital.stays$split.host, levels=desired.order.split)
hospital.stays$colo <- factor(hospital.stays$colo, levels= sapply(desired.order.split, function(x) gsub("T", "C", x)))
hospital.stays$host <- factor(hospital.stays$host, levels=desired.order.notsplit)

icu.stays.1 <- timings[,c("split.host", "study_no", "ward", "icu_adm_d_1", "icu_disc_d_1")]
icu.stays.1$icu_adm_d_1 <- parse_date_time(icu.stays.1$icu_adm_d_1, "%d%m%Y")
icu.stays.1$icu_adm_d_1 <- decimal_date(icu.stays.1$icu_adm_d_1) 
icu.stays.1$icu_disc_d_1 <- parse_date_time(icu.stays.1$icu_disc_d_1, "%d%m%Y")
icu.stays.1$icu_disc_d_1 <- decimal_date(icu.stays.1$icu_disc_d_1) 

icu.stays.2 <- timings[!is.na(timings$icu_adm_d_2), c("split.host", "study_no", "ward", "icu_adm_d_2", "icu_disc_d_2")]
icu.stays.2$icu_adm_d_2 <- parse_date_time(icu.stays.2$icu_adm_d_2, "%d%m%Y")
icu.stays.2$icu_adm_d_2 <- decimal_date(icu.stays.2$icu_adm_d_2) 
icu.stays.2$icu_disc_d_2 <- parse_date_time(icu.stays.2$icu_disc_d_2, "%d%m%Y")
icu.stays.2$icu_disc_d_2 <- decimal_date(icu.stays.2$icu_disc_d_2) 

icu.stays.3 <- timings[!is.na(timings$icu_adm_d_3), c("split.host", "study_no", "ward", "icu_adm_d_3", "icu_disc_d_3")]
icu.stays.3$icu_adm_d_3 <- parse_date_time(icu.stays.3$icu_adm_d_3, "%d%m%Y")
icu.stays.3$icu_adm_d_3 <- decimal_date(icu.stays.3$icu_adm_d_3) 
icu.stays.3$icu_disc_d_3 <- parse_date_time(icu.stays.3$icu_disc_d_3, "%d%m%Y")
icu.stays.3$icu_disc_d_3 <- decimal_date(icu.stays.3$icu_disc_d_3) 

icu.stays.4 <- timings[!is.na(timings$icu_adm_d_4), c("split.host", "study_no", "ward", "icu_adm_d_4", "icu_disc_d_4")]
icu.stays.4$icu_adm_d_4 <- parse_date_time(icu.stays.4$icu_adm_d_4, "%d%m%Y")
icu.stays.4$icu_adm_d_4 <- decimal_date(icu.stays.4$icu_adm_d_4) 
icu.stays.4$icu_disc_d_4 <- parse_date_time(icu.stays.4$icu_disc_d_4, "%d%m%Y")
icu.stays.4$icu_disc_d_4 <- decimal_date(icu.stays.4$icu_disc_d_4) 

colnames(icu.stays.1) <- c("split.host", "host", "ward", "icu_adm_d", "icu_disc_d")
colnames(icu.stays.2) <- c("split.host", "host", "ward", "icu_adm_d", "icu_disc_d")
colnames(icu.stays.3) <- c("split.host", "host", "ward", "icu_adm_d", "icu_disc_d")
colnames(icu.stays.4) <- c("split.host", "host", "ward", "icu_adm_d", "icu_disc_d")

icu.stays <- rbind(icu.stays.1, icu.stays.2, icu.stays.3, icu.stays.4)

icu.stays$colo <- sapply(icu.stays$split.host, function(x) gsub("T", "C", x))
icu.stays$split.host <- factor(icu.stays$split.host, levels=desired.order.split)
icu.stays$colo <- factor(icu.stays$colo, levels= sapply(desired.order.split, function(x) gsub("T", "C", x)))
icu.stays$host <- factor(icu.stays$host, levels=desired.order.notsplit)

collapsed.tree <- read_csv("collapsed_tree_on_consensus_tree.csv")
collapsed.tree <- collapsed.tree[startsWith(collapsed.tree$hosts, "T"),]

collapsed.tree$latest.time <- split.times$first.samp[match(collapsed.tree$unique.splits, split.times$subgraph)]
collapsed.tree$p.latest.time <- split.times$first.samp[match(collapsed.tree$parent.splits, split.times$subgraph)]
collapsed.tree <- collapsed.tree[which(collapsed.tree$parent.hosts!="unassigned_region"),]

collapsed.tree$colo <- sapply(collapsed.tree$hosts, function(x) gsub("T", "C", x))
collapsed.tree$hosts <- factor(collapsed.tree$hosts, levels=desired.order.split)
collapsed.tree$colo <- factor(collapsed.tree$colo, levels= sapply(desired.order.split, function(x) gsub("T", "C", x)))
collapsed.tree$par.colo <- sapply(collapsed.tree$parent.hosts, function(x) gsub("T", "C", x))
collapsed.tree$parent.hosts <- factor(collapsed.tree$parent.hosts, levels=desired.order.split)
collapsed.tree$par.colo <- factor(collapsed.tree$par.colo, levels= sapply(desired.order.split, function(x) gsub("T", "C", x)))

collapsed.tree$orig.hosts <- substr(collapsed.tree$hosts, 1, 4)
collapsed.tree$orig.parent.hosts <- substr(collapsed.tree$parent.hosts, 1, 4)

collapsed.tree$orig.hosts <- factor(collapsed.tree$orig.hosts, levels=desired.order.notsplit)
collapsed.tree$orig.parent.hosts <- factor(collapsed.tree$orig.parent.hosts, levels=desired.order.notsplit)

splits.2$ward <- hospital.stays$ward[match(splits.2$host, hospital.stays$split.host)]
splits.2$orig.host <- substr(splits.2$host, 1, 4)

splits.2$colo <- sapply(splits.2$host, function(x) gsub("T", "C", x))

splits.2$host <- factor(splits.2$host, levels=desired.order.split)
splits.2$colo <- factor(splits.2$colo, levels=sapply(desired.order.split, function(x) gsub("T", "C", x)))


splits.2$orig.host <- factor(splits.2$orig.host, levels=desired.order.notsplit)

splits.3 <- splits.2[,c(1,4,5,6,7,8)] %>%
  group_by(host, tip.dates, ward, orig.host, colo) %>%
  summarise(reads = sum(reads)) %>%
  ungroup()

splits.3$positive <- T

exam.dates.adult <- read_csv("Swab_adults.csv")
exam.dates.adult$Sample_date1 <- decimal_date(parse_date_time(exam.dates.adult$Sample_date1, "%d-%m-%y"))
exam.dates.adult$Id <- sapply(exam.dates.adult$Id, function(x) {
  while(nchar(x)<3){
    x <- paste0(0, x)
  }
  x <- paste0("T", x)
  x
})

exam.dates.child <- read_csv("Swab_children.csv")
exam.dates.child$Sample_date1 <- decimal_date(parse_date_time(exam.dates.child$Sample_date1, "%d-%m-%y"))
exam.dates.child$Id <- sapply(exam.dates.child$Id, function(x) {
  while(nchar(x)<3){
    x <- paste0(0, x)
  }
  x <- paste0("T", x)
  x
})


exam.dates.nurse <- read_csv("Swab_nurses.csv")
exam.dates.nurse$Sample_date1 <- decimal_date(parse_date_time(exam.dates.nurse$Sample_date1, "%d-%m-%y"))
exam.dates.nurse$Id <- sapply(exam.dates.nurse$Id, function(x) {
  while(nchar(x)<3){
    x <- paste0(0, x)
  }
  x <- paste0("T", x)
  x
})

exam.dates <- rbind(exam.dates.adult, exam.dates.child, exam.dates.nurse)
exam.dates <- exam.dates[,c("Id", "Sample_date1")]
exam.dates <- unique(exam.dates)
exam.dates$split.id <- exam.dates$Id
doubled.hosts <- c("T035", "T159", "T099", "T271", "T327")
for(dh in doubled.hosts){
  old.rows <- which(exam.dates$Id==dh)
  copy <- exam.dates[old.rows,]
  exam.dates$split.id[old.rows] <- paste0(exam.dates$split.id[old.rows],"a")
  copy$split.id <- paste0(copy$split.id,"b")
  exam.dates <- rbind(exam.dates, copy)
}
exam.dates <- exam.dates[order(exam.dates$split.id),]
for(row.no in 1:nrow(exam.dates)){
  split.host <- exam.dates$split.id[row.no]
  colo <- gsub("T", "C", exam.dates$split.id[row.no])
  unsplit.host <- exam.dates$Id[row.no]
  splits.rows <- splits.3[which(splits.3$host==split.host & splits.3$positive == T),]
  if(nrow(splits.rows)>0){
    location <- unique(splits.rows$ward)
    if(length(location)!=1){
      cat("Help! ",row.no,"\n")
      break
    }
    time <- exam.dates$Sample_date1[row.no]
    if(!(time %in% splits.rows$tip.dates)){
      new.row <- c(split.host, time, location, unsplit.host, colo, 0, F)
      splits.3 <- add_row(splits.3, host = split.host, tip.dates = time, ward = location, orig.host = unsplit.host, colo = colo, reads = 0, positive = F)
    }
  }
}

splits.3$tip.dates <- as.numeric(splits.3$tip.dates)
splits.3$host <- factor(splits.3$host, levels=desired.order.split)
splits.3$host <- factor(splits.3$colo, levels=sapply(desired.order.split, function(x) gsub("T", "C", x)))

al.colours <- c(rep(seq_gradient_pal("lightblue", "darkblue")(seq(0, 1, length.out = 3)[2]), 5), 
                rep(seq_gradient_pal("pink", "darkred")(seq(0, 1, length.out = 3)[2]), 31), 
                rep(seq_gradient_pal("lightgreen", "darkgreen")(seq(0, 1, length.out = 3)[2]), 24))

collapsed.tree.2 <- unique(collapsed.tree[,c(3,4,5,6,7,8,9, 10)])

dec.1.2007 <- decimal_date(parse_date_time("01/12/2007", "%d%m%y"))
jan.1.2008 <- decimal_date(parse_date_time("01/01/2008", "%d%m%y"))
feb.1.2008 <- decimal_date(parse_date_time("01/02/2008", "%d%m%y"))
mar.1.2008 <- decimal_date(parse_date_time("01/03/2008", "%d%m%y"))
apr.1.2008 <- decimal_date(parse_date_time("01/04/2008", "%d%m%y"))
may.1.2008 <- decimal_date(parse_date_time("01/05/2008", "%d%m%y"))
jun.1.2008 <- decimal_date(parse_date_time("01/06/2008", "%d%m%y"))
jul.1.2008 <- decimal_date(parse_date_time("01/07/2008", "%d%m%y"))

month.starts <- c(dec.1.2007, jan.1.2008, feb.1.2008, mar.1.2008, apr.1.2008, may.1.2008, jun.1.2008, jul.1.2008)
month.middles <- month.starts[1:(length(month.starts) -1)] + diff(month.starts)/2
month.labels <- c("Dec 2007", "Jan 2008", "Feb 2008", "Mar 2008", "Apr 2008", "May 2008", "Jun 2008")

single.sampled <- read_csv("trace_colonisation_patients.csv") %>% pull(id)
single.sampled <- single.sampled[which(startsWith(single.sampled, "T"))]

single.sampled.C <- paste0("C", substr(single.sampled, 2, nchar(single.sampled)))

pale <- desired.order.split %in% single.sampled

row.colours <- c(rep("darkblue", 5), rep("darkred", 31), rep("darkgreen", 24))

row.colours <- sapply(1:length(row.colours), function(x){
  if(pale[x]){
    if(x<=5){
      "lightblue"
    } else if(x<=36){
      "pink"
    } else {
      "lightgreen"
    }
  } else {
    row.colours[x]
  }
})


t.image <- ggplot() +  
  geom_segment(data=hospital.stays, aes(x=hosp_adm_d - (0.5/366), xend=hosp_disc_d + (0.5/366), y=colo, yend=colo, col=split.host), size=0.75 ) +
  geom_segment(data=icu.stays, aes(x=icu_adm_d - (0.5/366), xend=icu_disc_d + (0.5/366), y=colo, yend=colo, col=split.host), size=1.5) +
  annotate("segment", x = min(splits.3$tip.dates) - (0.5/366), xend = min(splits.3$tip.dates) - (0.5/366), y = 0, yend = 61, alpha = .5, linetype="longdash") +
  annotate("segment", x = max(splits.3$tip.dates) + (0.5/366), xend = max(splits.3$tip.dates) + (0.5/366), y = 0, yend = 61, alpha = .5, linetype="longdash") +
  geom_point(data = splits.3[which(splits.3$positive==T),], aes(x=tip.dates, y=colo), shape=21, size=2, stroke=1,  colour="black") +
  geom_point(data = splits.3[which(splits.3$positive==F),], aes(x=tip.dates, y=colo), shape=4, size=2, stroke=0.5, colour="black") +
  scale_colour_manual(values = row.colours) +
  scale_x_continuous(breaks = c(dec.1.2007, jan.1.2008, feb.1.2008, mar.1.2008, apr.1.2008, may.1.2008, jun.1.2008, jul.1.2008), 
                     minor_breaks=NULL, 
                     limits = c(dec.1.2007, jul.1.2008), 
                     expand=c(0,0)) +
  geom_curve(data = collapsed.tree.2, aes(x=latest.time, xend=latest.time, y=par.colo, yend=colo), angle=135, col="gray25", alpha=0.75, arrow = arrow(length = unit(0.01, "npc")), curvature=0.3) +
  geom_point(data = collapsed.tree.2, aes(x=latest.time, y=par.colo),size=0.75, stroke=0.5, fill="white", shape=23, col="gray25", alpha=0.75) +
  theme_bw() +
  theme(legend.position="none") +
  xlab("Month") + 
  ylab("Colonisation") + 
  theme(axis.text.y = element_text(colour = al.colours), 
        axis.text.x = element_blank(), 
        axis.title.x = element_text(margin = margin(t=18)))

for(month.no in 1:(length(month.starts) -1)){
  extra.text <-   textGrob(month.labels[month.no], gp = gpar(fontsize=9))
  
  t.image <- t.image + annotation_custom(extra.text, xmin = month.starts[month.no], xmax = month.starts[month.no + 1], ymin = -2, ymax = -0.5)
}

g <- ggplotGrob(t.image)
g$layout$clip[g$layout$name=="panel"] <- "off"
pdf("Figure4S1.pdf", width=15, height=7)

grid.draw(g)

dev.off()


reduced.hospital.stays <- hospital.stays[which(!(hospital.stays$split.host %in% single.sampled)),]
reduced.hospital.stays$split.host <- droplevels(reduced.hospital.stays$split.host)
reduced.hospital.stays$colo <- droplevels(reduced.hospital.stays$colo)
reduced.icu.stays <- icu.stays[which(!(icu.stays$split.host %in% single.sampled)),]
reduced.icu.stays$split.host <- droplevels(reduced.icu.stays$split.host)
reduced.icu.stays$colo <- droplevels(reduced.icu.stays$colo)
reduced.row.colours <- row.colours[which(!(pale))]
reduced.splits.3 <- splits.3[which(!(splits.3$host %in% single.sampled.C)),]
reduced.splits.3$colo <- droplevels(reduced.splits.3$colo)

reduced.al.colours <- c(rep(seq_gradient_pal("lightblue", "darkblue")(seq(0, 1, length.out = 3)[2]), 1), 
                        rep(seq_gradient_pal("pink", "darkred")(seq(0, 1, length.out = 3)[2]), 15), 
                        rep(seq_gradient_pal("lightgreen", "darkgreen")(seq(0, 1, length.out = 3)[2]), 11))

reduced.collapsed.tree.2 <- collapsed.tree.2[which(!(collapsed.tree.2$colo %in% single.sampled.C)& !(collapsed.tree.2$par.colo %in% single.sampled.C)),]
reduced.collapsed.tree.2$colo <- droplevels(reduced.collapsed.tree.2$colo)
reduced.collapsed.tree.2$colo <- droplevels(reduced.collapsed.tree.2$colo)



t2.image <- ggplot() +  
 geom_segment(data=reduced.hospital.stays, aes(x=hosp_adm_d - (0.5/366), xend=hosp_disc_d + (0.5/366), y=colo, yend=colo, col=split.host), size=0.75 ) +
  geom_segment(data=reduced.icu.stays, aes(x=icu_adm_d - (0.5/366), xend=icu_disc_d + (0.5/366), y=colo, yend=colo, col=split.host), size=1.5) +
  annotate("segment", x = min(reduced.splits.3$tip.dates) - (0.5/366), xend = min(reduced.splits.3$tip.dates) - (0.5/366), y = 0, yend = 28, alpha = .5, linetype="longdash") +
  annotate("segment", x = max(reduced.splits.3$tip.dates) + (0.5/366), xend = max(reduced.splits.3$tip.dates) + (0.5/366), y = 0, yend = 28, alpha = .5, linetype="longdash") +
  geom_point(data = reduced.splits.3[which(reduced.splits.3$positive==T),], aes(x=tip.dates, y=colo), shape=21, size=2, stroke=1,  colour="black") +
  geom_point(data = reduced.splits.3[which(reduced.splits.3$positive==F),], aes(x=tip.dates, y=colo), shape=4, size=2, stroke=0.5, colour="black") +
  scale_colour_manual(values = reduced.row.colours) +
  scale_x_continuous(breaks = c(dec.1.2007, jan.1.2008, feb.1.2008, mar.1.2008, apr.1.2008, may.1.2008, jun.1.2008, jul.1.2008), 
                     minor_breaks=NULL, 
                     limits = c(dec.1.2007, jul.1.2008), 
                     expand=c(0,0)) +
  geom_curve(data = reduced.collapsed.tree.2, aes(x=latest.time, xend=latest.time, y=par.colo, yend=colo), angle=20, col="gray25", alpha=0.75, arrow = arrow(length = unit(0.025, "npc")), curvature=0.5) +
  geom_point(data = reduced.collapsed.tree.2, aes(x=latest.time, y=par.colo),size=0.75, stroke=0.5, fill="white", shape=23, col="gray25", alpha=0.75) +
  theme_bw() +
  theme(legend.position="none") +
  xlab("Month") + 
  ylab("Colonisation") + 
  theme(axis.text.y = element_text(colour = reduced.al.colours), 
        axis.text.x = element_blank(), 
        axis.title.x = element_text(margin = margin(t=18)))

for(month.no in 1:(length(month.starts) -1)){
  extra.text <-   textGrob(month.labels[month.no], gp = gpar(fontsize=9))
  
  t2.image <- t2.image + annotation_custom(extra.text, xmin = month.starts[month.no], xmax = month.starts[month.no + 1], ymin = -2, ymax = -0.5)
}

g <- ggplotGrob(t2.image)
g$layout$clip[g$layout$name=="panel"] <- "off"

pdf("Figure4.pdf", width=15, height=4.5)

grid.draw(g)

dev.off()
